# REPLICATION
# Place-Based Campaigning: The Political Impact of Real Grassroots Mobilization
# Daniel Bischof and Thomas Kurer
# Journal of Politics

# Data Collection: Webscraping Meetup

# NOTE TO USERS:
# Meetup has deprecated the REST API that is used in this code, which in turn heavily relies on the meetupr package.
# This change means that the meetupr package is no longer functional.
# The code provided here thus serves research transparency purposes only. 
# We provide the full data resulting from running this code in December 2018 in a separate csv file (NAME).


# Preparation
# Close the active graphics device, but only when one is actually open:
# a bare dev.off() errors on the null device (device 1) and would abort the
# script when it is sourced or run non-interactively.
if (dev.cur() > 1L) {
  dev.off()
}
# Emit a form-feed character (commonly used to clear the console in IDEs).
cat("\014")
# globals
# Large penalty on scientific notation so numbers print in fixed notation.
options(scipen = 999)

# packages
library(tidyverse)
library(readstata13)
library(texreg)

# devtools::install_github("rladies/meetupr") # NOTE: package is not currently functional
library(meetupr)

# Meetup API key
# Stores the key in the MEETUP_KEY environment variable for this R session;
# replace the placeholder string with your own key before running.
# NOTE(review): presumably meetupr reads MEETUP_KEY when `api_key` is NULL --
# confirm against the meetupr documentation.
Sys.setenv(MEETUP_KEY = "inputyourkey") 


## Adjust meetup functions ----

my_find_groups <- function(text = NULL, radius = "global", api_key = NULL) {
  # Full-text group search against the "find/groups" endpoint.
  #
  # Args:
  #   text:    search string forwarded to the API.
  #   radius:  search radius forwarded to the API (default "global").
  #   api_key: forwarded to meetupr:::.fetch_results().
  #
  # Returns:
  #   A tibble with one row per returned group. Fields absent from a result
  #   are filled with NA.
  hits <- meetupr:::.fetch_results(
    api_method = "find/groups",
    api_key = api_key,
    text = text,
    radius = radius
  )

  # Typed extractors over the result list; `field` may be a single name or a
  # character vector describing a nested path (e.g. organizer -> name).
  take_int <- function(field) purrr::map_int(hits, field, .null = NA)
  take_chr <- function(field) purrr::map_chr(hits, field, .null = NA)
  take_dbl <- function(field) purrr::map_dbl(hits, field, .null = NA)

  tibble::tibble(
    id = take_int("id"),
    name = take_chr("name"),
    urlname = take_chr("urlname"),
    description = take_chr("description"),
    created = meetupr:::.date_helper(take_dbl("created")),
    members = take_int("members"),
    status = take_chr("status"),
    # same source field as organizer_name below; both columns are kept
    organizer = take_chr(c("organizer", "name")),
    lat = take_dbl("lat"),
    lon = take_dbl("lon"),
    city = take_chr("city"),
    state = take_chr("state"),
    country = take_chr("country"),
    timezone = take_chr("timezone"),
    organizer_id = take_int(c("organizer", "id")),
    organizer_name = take_chr(c("organizer", "name")),
    category_id = take_int(c("category", "id")),
    category_name = take_chr(c("category", "name"))
    # , resource = hits
  )
}


my_get_events <- function(urlname, event_status = "upcoming", api_key = NULL) {
  # Fetch the events of one Meetup group from the "<urlname>/events" endpoint.
  #
  # Args:
  #   urlname:      URL name of the group.
  #   event_status: NULL, or one or more of "cancelled", "draft", "past",
  #                 "proposed", "suggested", "upcoming". Several statuses are
  #                 sent to the API as one comma-separated string.
  #   api_key:      forwarded to meetupr:::.fetch_results().
  #
  # Returns:
  #   A tibble with one row per event (event, venue and group attributes).
  #   Fields absent from a result are filled with NA.
  #
  # Errors:
  #   Stops with "Event status ... not allowed" if any element of
  #   `event_status` is not a permitted status.
  allowed_status <- c(
    "cancelled", "draft", "past", "proposed", "suggested", "upcoming"
  )

  # Validate every element. The previous scalar test
  # (`!is.null(x) && !x %in% allowed`) received a length > 1 condition when
  # several statuses were passed, which is an error in recent R versions and
  # only checked the first element in older ones.
  rejected <- event_status[!event_status %in% allowed_status]
  if (length(rejected) > 0) {
    stop(sprintf(
      "Event status %s not allowed",
      paste(rejected, collapse = ", ")
    ))
  }

  # Multiple statuses are passed along as a comma-separated list.
  if (length(event_status) > 1) {
    event_status <- paste(event_status, collapse = ",")
  }

  api_method <- paste0(urlname, "/events")
  res <- meetupr:::.fetch_results(api_method, api_key, event_status)

  tibble::tibble(
    event_id = purrr::map_chr(res, "id", .null = NA),
    event_name = purrr::map_chr(res, "name", .null = NA),
    event_created = meetupr:::.date_helper(purrr::map_dbl(res, "created", .null = NA)),
    status = purrr::map_chr(res, "status", .null = NA),
    time = meetupr:::.date_helper(purrr::map_dbl(res, "time", .null = NA)),
    local_date = as.Date(purrr::map_chr(res, "local_date", .null = NA)),
    local_time = purrr::map_chr(res, "local_time", .null = NA),
    waitlist_count = purrr::map_int(res, "waitlist_count", .null = NA),
    yes_rsvp_count = purrr::map_int(res, "yes_rsvp_count", .null = NA),
    venue_id = purrr::map_int(res, c("venue", "id"), .null = NA),
    venue_name = purrr::map_chr(res, c("venue", "name"), .null = NA),
    venue_lat = purrr::map_dbl(res, c("venue", "lat"), .null = NA),
    venue_lon = purrr::map_dbl(res, c("venue", "lon"), .null = NA),
    venue_address_1 = purrr::map_chr(res, c("venue", "address_1"), .null = NA),
    venue_city = purrr::map_chr(res, c("venue", "city"), .null = NA),
    venue_state = purrr::map_chr(res, c("venue", "state"), .null = NA),
    venue_zip = purrr::map_chr(res, c("venue", "zip"), .null = NA),
    venue_country = purrr::map_chr(res, c("venue", "country"), .null = NA),
    event_description = purrr::map_chr(res, c("description"), .null = NA),
    link = purrr::map_chr(res, c("link"), .null = NA),
    # NOTE(review): group id, lat and lon are extracted as character here,
    # unlike the venue coordinates above; left unchanged so the resulting
    # data keep their original types -- confirm this is intended.
    group_id = purrr::map_chr(res, c("group", "id"), .null = NA),
    group_created = meetupr:::.date_helper(purrr::map_dbl(res, c("group", "created"), .null = NA)),
    group_name = purrr::map_chr(res, c("group", "name"), .null = NA),
    group_location = purrr::map_chr(res, c("group", "localized_location"), .null = NA),
    group_lat = purrr::map_chr(res, c("group", "lat"), .null = NA),
    group_lon = purrr::map_chr(res, c("group", "lon"), .null = NA),
    group_state = purrr::map_chr(res, c("group", "state"), .null = NA),
    group_country = purrr::map_chr(res, c("group", "country"), .null = NA),
    group_url = purrr::map_chr(res, c("group", "urlname"), .null = NA)
    # ,
    # resource = res # uncomment if you want full list of all available attributes
  )
}


## Create complete list of all M5S groups on Meetup ----

# twofold approach

# 1. manual search via meetup topics
# collect all groups subscribing to topic "movimento 5 stelle" and "Beppe Grillo", respectively.
# extract urls via tools.buzzstream.com
# ==> URLsWithDomains_Movimento5Stelle_20181203.csv
# ==> URLsWithDomains_BeppeGrillo_20181203.csv

# 2. full-text search via meetupr find_groups function with same keywords. Finds more groups that have not subscribed to topic.

# final step: drop duplicates, drop irrelevant groups (e.g. italian language groups abroad that chose topic m5s to attract people)


# (1) urls topic Movimento 5 Stelle ----

# Read the manually exported URL list for the topic "Movimento 5 Stelle".
urls <- read.csv(
  "URLsWithDomains_Movimento5Stelle_20181203.csv",
  stringsAsFactors = FALSE
)
# Keep everything from character 23 onwards.
# NOTE(review): presumably strips the site prefix of each URL -- confirm
# against the CSV.
urls$group <- substr(urls$URL, 23, nchar(urls$URL))

# Drop entries containing "cities", then keep only the rows between the first
# and the last group entry (identified by their URL names).
meetups <- urls %>% filter(!grepl("cities", group))
rowstart <- grep("Grilli-Milano", meetups$group)
rowend <- grep("Amici-di-Beppe-Grillo-Valbormida", meetups$group)

# Column 4 is selected by position.
# NOTE(review): presumably the `group` column added above -- confirm the CSV
# has exactly three columns.
meetups <- as.data.frame(meetups[rowstart:rowend, 4])
meetups[[1]] <- gsub("/", "", meetups[[1]])

# Single remaining column = group URL names without slashes.
groups_topic_m5s <- meetups[[1]]

# urls topic Beppe Grillo

# Read the manually exported URL list for the topic "Beppe Grillo".
urlsBG <- read.csv(
  "URLsWithDomains_BeppeGrillo_20181203.csv",
  stringsAsFactors = FALSE
)
# Keep everything from character 23 onwards.
# NOTE(review): presumably strips the site prefix of each URL -- confirm
# against the CSV.
urlsBG$groupBG <- substr(urlsBG$URL, 23, nchar(urlsBG$URL))

# Drop entries containing "cities", then keep only the rows between the first
# and the last group entry.
meetupsBG <- urlsBG %>% dplyr::filter(!grepl("cities", groupBG))
rowstartBG <- grep("Gli-amici-di-Beppe-Grillo-di-Napoli", meetupsBG$groupBG)
rowendBG <- grep("sbaglio", meetupsBG$groupBG)

# Column 4 is selected by position.
# NOTE(review): presumably the `groupBG` column added above -- confirm the CSV
# has exactly three columns.
meetupsBG <- as.data.frame(meetupsBG[rowstartBG:rowendBG, 4])
meetupsBG[[1]] <- gsub("/", "", meetupsBG[[1]])

# Single remaining column = group URL names without slashes.
groups_topic_abg_full <- meetupsBG[[1]]

# only those not already covered by M5S topic (many are cross-listed)
groups_topic_abg <- setdiff(groups_topic_abg_full, groups_topic_m5s)

# (2) alternative via full text search ----

# Full-text search with the same two keywords as the topic search.
ft_m5s <- my_find_groups(text = "movimento 5 stelle")
ft_abg <- my_find_groups(text = "amici di beppe grillo")

# URL names of the groups returned by each search.
groups_ft_m5s <- ft_m5s[["urlname"]]
groups_ft_abg <- ft_abg[["urlname"]]

# define complete list of groups

# Groups found by all four sources (both topic lists, both full-text searches).
core <- intersect(
  intersect(
    intersect(groups_topic_abg_full, groups_topic_m5s),
    groups_ft_abg
  ),
  groups_ft_m5s
)
# Full-text hits that are not part of that common core.
ft_abg_only <- base::setdiff(groups_ft_abg, core)
ft_m5s_only <- base::setdiff(groups_ft_m5s, core)

# find more groups by topic rather than full-text search?
m5s_topic_only <- base::setdiff(groups_topic_m5s, groups_ft_m5s)
abg_topic_only <- base::setdiff(groups_topic_abg, groups_ft_abg)

# Complete candidate list: union of all four sources.
groups <- union(
  union(
    union(groups_topic_abg_full, groups_topic_m5s),
    groups_ft_abg
  ),
  groups_ft_m5s
)

# Keep only URL names that contain none of the keywords (case-insensitive);
# these are the ambiguous groups that are checked further below.
keywords <- c("m5s", "movimento", "stelle", "grilli", "grillo", "attivisti")
groups_filter <- groups[
  !grepl(paste(keywords, collapse = "|"), groups, ignore.case = TRUE)
]

# (3) drop irrelevant cases (false positives) ----

# check ambiguous groups (unclear if really related to m5s)
# get description of unclear groups

# Keep the rows of `candidates` whose name AND description contain none of
# the keywords `keys` (case-insensitive).
drop_keyword_matches <- function(candidates, keys) {
  any_key <- paste(keys, collapse = "|")
  candidates %>%
    filter(
      !grepl(any_key, name, ignore.case = TRUE),
      !grepl(any_key, description, ignore.case = TRUE)
    )
}

# search name and description for keywords.
keywords2 <- c("m5s", "movimento", "stelle", "grilli", "grillo")

# Ambiguous groups from the "movimento 5 stelle" full-text search.
ft_m5s_filter <- ft_m5s %>%
  filter(urlname %in% groups_filter) %>%
  select(urlname, name, description)

ft_m5s_filter_bad <- drop_keyword_matches(ft_m5s_filter, keywords2)

# 30 groups which are not m5s groups in the narrow sense (no mentioning).

# Ambiguous groups from the "amici di beppe grillo" full-text search.
ft_abg_filter <- ft_abg %>%
  filter(urlname %in% groups_filter) %>%
  select(urlname, name, description)

ft_abg_filter_bad <- drop_keyword_matches(ft_abg_filter, keywords2)

# 6 groups which are not m5s groups in the narrow sense (no mentioning).

groups_bad <- union(ft_abg_filter_bad$urlname, ft_m5s_filter_bad$urlname)
# 30 in total

# create clean group list (only true positives)

groups_clean <- setdiff(groups, groups_bad)

# (5) Create Full List of Events (for all groups in cleaned list of groups) ----

# One list entry per group, keyed by the group's URL name.
eventslist <- list()

for (groupname in groups_clean) {
  tryCatch(
    {
      urlname <- groupname
      events <- my_get_events(
        urlname,
        event_status = c("past", "proposed", "upcoming")
      )
      # pause between requests
      Sys.sleep(1)
      events$groupname <- groupname
      eventslist[[groupname]] <- events
    },
    # A failing group is reported and skipped; the loop continues.
    error = function(e) {
      message(
        "There is an error with group: ", groupname,
        " The error message is: ", e
      )
    }
  )
}

# Stack the per-group results into one table.
allevents <- do.call(rbind, eventslist)

# find groups with zero output (-> produces error message)
# either not existing or zero events (content might be partly private)
# some might be related to server answer problem

# add modena case, could not be parsed because of proposed events

# Fetched separately, without the "proposed" status.
modena <- my_get_events(
  urlname = "grillimodenesi",
  event_status = c("past", "upcoming")
)

# unnecessary, same as group_url
allevents <- select(allevents, -groupname)
events <- rbind(allevents, modena)

# (6) drop duplicates ----

# Count fully duplicated rows.
table(duplicated(events))
# 2912 duplicates

# Keep the first occurrence of every row.
events_unique <- events[!duplicated(events), , drop = FALSE]

# (7) write events ----
write.csv(
  events_unique,
  file = "events_20181206.csv",
  row.names = FALSE,
  fileEncoding = "UTF-8"
)


# (8) Create Full List of Groups ----

# Groups that appear in the event data vs. groups on the clean list without
# any downloaded event.
groups_output <- unique(events$group_url)
# NOTE(review): the first element is dropped unconditionally; the reason is
# not visible here -- confirm.
missing_groups <- setdiff(groups_clean, groups_output)[-1]
# 103 groups that had no events so far.
# but all of them are valid, groups with real members (name, photo, ...)

# Join existing group info from above

# union of the two full-text searches, restricted to the clean group list
groups_ft <- rbind(ft_m5s, ft_abg) %>%
  unique() %>%
  filter(urlname %in% groups_clean)
groups_ft$source <- "ft_r1"

# Clean groups not covered by the full-text results, i.e. those known only
# from the topic lists.
groups_topic_url <- setdiff(groups_clean, groups_ft$urlname)

length(intersect(groups_topic_url, events$group_url))
# extract groupname for ft search (rather than url)

# First recorded group name per topic-only group URL.
groups_topic_name <- events %>%
  filter(group_url %in% groups_topic_url) %>%
  select(group_url, group_name) %>%
  group_by(group_url) %>%
  filter(row_number(group_url) == 1) %>%
  pull(group_name)

# parse group info

# One list entry per searched group name.
grouplist <- list()

for (groupname in groups_topic_name) {
  tryCatch(
    {
      fullname <- groupname
      group <- my_find_groups(fullname)
      # pause between requests
      Sys.sleep(1)
      grouplist[[groupname]] <- group
    },
    # A failing search is reported and skipped; the loop continues.
    error = function(e) {
      message(
        "There is an error with group: ", groupname,
        " The error message is: ", e
      )
    }
  )
}

groups_topic <- do.call(rbind, grouplist)

# The full-text search can return other groups as well: keep only exact name
# matches, then drop duplicated rows.
groups_topic <- unique(filter(groups_topic, name %in% groups_topic_name))

nrow(groups_topic)
# 66, 2 could not be parsed.
# Group-level columns of the first event row recorded for one group URL.
first_group_info <- function(target_url) {
  events %>%
    filter(group_url == target_url) %>%
    filter(row_number(group_url) == 1) %>%
    select(starts_with("group"))
}

# Build a one-row group record from event-level group info. Values are given
# by position and must follow the column order of `groups_topic`; fields not
# available from the event data are set to NA, status to "active" and
# country to "IT". `members` is supplied by hand.
manual_group_row <- function(info, members) {
  group_row <- data.frame(
    info$group_id, info$group_name, info$group_url, NA,
    info$group_created, members, "active", NA,
    info$group_lat, info$group_lon, info$group_location, NA,
    "IT", NA, NA, NA, NA, NA
  )
  names(group_row) <- names(groups_topic)
  group_row
}

# The two groups that could not be parsed are added by hand.
canino <- first_group_info("Grillini-Canino")
caninogroup <- manual_group_row(canino, 51)

storico <- first_group_info("ff783d32-070b-436f-b1e9-5b093a8647dd")
storicogroup <- manual_group_row(storico, 279)

groups_topic <- rbind(rbind(groups_topic, caninogroup), storicogroup)

# 3 topic groups that are not in event file (no events)

# Topic-only group URLs that never appear in the event data.
groups_topic_missing <- setdiff(groups_topic_url, events$group_url)

grouplist2 <- list()

for (groupname in groups_topic_missing) {
  tryCatch(
    {
      fullname <- groupname
      # NOTE(review): the search text here is the URL name, and the results
      # are not filtered afterwards (unlike the name-based search earlier in
      # this section) -- confirm that only the intended groups are returned.
      group <- my_find_groups(fullname)
      # pause between requests
      Sys.sleep(1)
      grouplist2[[groupname]] <- group
    },
    # A failing search is reported and skipped; the loop continues.
    error = function(e) {
      message(
        "There is an error with group: ", groupname,
        " The error message is: ", e
      )
    }
  )
}

# Replace the URL vector by the stacked search results.
groups_topic_missing <- do.call(rbind, grouplist2)

groups_topic <- rbind(groups_topic, groups_topic_missing)

# Mark the origin of these rows.
groups_topic$source <- "topic"

# Stack full-text and topic groups, then clean the free-text columns:
# commas in name/description are replaced by spaces, and the corrupt
# description of group 7177692 is blanked.
groups <- rbind(groups_ft, groups_topic) %>%
  mutate(
    name = gsub(",", " ", name),
    description = gsub(",", " ", description),
    # corrupt description
    description = replace(description, id == 7177692, NA)
  )
# n=1147


# (9) write groups ----
write.csv(
  groups,
  file = "groups_20181206.csv",
  row.names = FALSE,
  fileEncoding = "UTF-8"
)
